/*
Obtain the gendered amenity classifications and add them to the analysis samples (separate by gender)
	// input: amenities_gender_sample, clauselabels
	// output: amenity_value_gender_fem, amenity_value_gender_mal
*/

* Close any log still open from an earlier run, then start a fresh log for this script.
* Both commands run under capture, so an error in either one is ignored silently.
capture log close
capture log using "$logs/amenity_value_coeffs_separate_log", replace


*************
**LOAD DATA**
*************

* Sample for data-driven classification of clauses
* (forward slash keeps the path portable and consistent with the other paths in this file)
use "$files/amenities_gender_sample", clear

* PageRank normalizations
* Reference observation(s): smallest male-female gap in fe_*, and among those the
* smallest male-female gap in PageRank. The summarize results are copied into
* temporary scalars right away, so the commands that follow do not depend on r()
* surviving untouched across several intervening commands.
gen fe_diff = abs(fe_m-fe_f)
gen pagerank_diff = abs(pagerank_m-pagerank_f)
tempname fe_gap_min pr_gap_min m_over_f

qui sum fe_diff, d
scalar `fe_gap_min' = r(min)
list estabid fe_m fe_f fe_diff pagerank_m pagerank_f pagerank_diff if fe_diff==`fe_gap_min'

qui sum pagerank_diff if fe_diff==`fe_gap_min'
scalar `pr_gap_min' = r(min)
list estabid fe_m fe_f fe_diff pagerank_m pagerank_f pagerank_diff if pagerank_diff==`pr_gap_min'

* Diff, X, Ratio, A and B are non-missing only where pagerank_diff equals the reference gap.
* NOTE(review): that condition is not restricted to fe_diff==min, so any other
* observation with exactly the same pagerank_diff is included as well - confirm intended.
gen Diff = pagerank_diff if pagerank_diff==`pr_gap_min'
gen X = pagerank_m if pagerank_diff==`pr_gap_min'
gen Ratio = pagerank_m/pagerank_f if pagerank_diff==`pr_gap_min'
* NOTE(review): A and B are not used again in this file; B is missing whenever Ratio==1.
gen B = Diff/(X*(Ratio-1))
gen A = (Diff/X)+B

* Male PageRank is left as is; female PageRank is rescaled by the mean male/female
* ratio over the reference observation(s).
gen pagerank_m_norm = pagerank_m
sum Ratio
scalar `m_over_f' = r(mean)
gen pagerank_f_norm = pagerank_f*`m_over_f'

* PageRanks: logs of exp(V)
gen ln_pagerank_m = log(pagerank_m)
gen ln_pagerank_f = log(pagerank_f)
gen ln_pagerank_m_norm = log(pagerank_m_norm)
gen ln_pagerank_f_norm = log(pagerank_f_norm)

* Inverse hyperbolic sine of clauses
* Transforms, in place, every variable whose name matches *cl_*
foreach clausevar of varlist *cl_* {
	replace `clausevar' = asinh(`clausevar')
}

* Restrictions
// tag one observation per establishment, used to tabulate cnt once per establishment
egen tag_estabid = tag(estabid)
summarize cnt, detail
tabulate cnt if tag_estabid==1
// keep establishments with at least 4 sectoral contracts over the time frame considered
// (observations with missing cnt are retained: missing is not smaller than 4)
drop if cnt<4
// random split into two samples (sample==1 and sample==2) with a fixed seed
// NOTE(review): the split is drawn per observation; if estabid repeats across rows,
// confirm that an establishment-level split is not what is wanted
splitsample, generate(sample) nsplit(2) rseed(1234)
sort estabid

* Some descriptives
// female-minus-male gap in log PageRank, without and with the normalization
gen gap_pagerank = ln_pagerank_f - ln_pagerank_m
gen gap_pagerank_norm = ln_pagerank_f_norm - ln_pagerank_m_norm
// min-max index of each PageRank: 0 at the sample minimum, 100 at the sample maximum
foreach g in m f {
	qui sum pagerank_`g'
	gen pagerank_`g'_index = (pagerank_`g'-r(min))*(100/(r(max)-r(min)))
}
// female-minus-male gap in the index
gen gap_pagerank_index = pagerank_f_index - pagerank_m_index
// pairwise correlations of the three gap measures
corr gap_pagerank*

* Clause labeling
// run the external do-file that labels the clause variables
do "$raw/clauselabels.do"
// two clauses are removed because of collinearity; summarize them first for the log
local collinear_clauses "cl_43pro_pro cl_51aut_tra"
sum `collinear_clauses'
drop `collinear_clauses'

* Geography and industry
// Coarser identifiers obtained by dropping trailing digits of the numeric codes.
// NOTE(review): which digits remain depends on the length of the municipality and
// cnae20subcl codes in the data - confirm uf, region and ind1-ind3 are the intended levels.
generate uf = floor(municipality/10000)
generate region = floor(municipality/100000)
generate ind1 = floor(cnae20subcl/1000000)
generate ind2 = floor(cnae20subcl/100000)
generate ind3 = floor(cnae20subcl/10000)
	
	
*****************************************************
*** select female clauses with data driven method ***
*****************************************************
* NOTE(review): eststo clear only drops results saved with eststo; the results below are
* saved with estimates store, so this line does not remove them - confirm what was intended.
eststo clear

* Female outcomes, one per method:
*   method1 = log PageRank, method2 = log normalized PageRank, method3 = PageRank index
* For every method the same steps are run:
*   1. adaptive lasso of the outcome on fe_f and all cl_ variables, estimated on sample==1
*   2. evaluation: selected coefficients, goodness of fit by sample, outcome variance by sample
*   3. post-selection OLS on sample==1 with uf and ind2 absorbed as fixed effects
*   4. export of the standardized lasso coefficients and the OLS coefficients
*   5. the same lasso on the entire sample, exported to a separate sheet, as a check
* NOTE(review): ln_pagerank_f_norm is ln_pagerank_f plus a constant (pagerank_f is rescaled
* by a single number), so method2 may differ from method1 only in the intercept - confirm
* that method2 is still wanted.
local outcomes_fem "ln_pagerank_f ln_pagerank_f_norm pagerank_f_index"
local xlsx_fem "$tables/amenity_value_gender_fem.xlsx"

local m = 0
foreach depvar of local outcomes_fem {
	local ++m

	* The first export recreates the workbook; later exports add sheets to it
	local filemode modify
	if `m'==1 local filemode replace

	** SPLIT SAMPLE **

	* Adaptive lasso
	lasso linear `depvar' fe_f cl_* if sample==1, selection(adaptive) rseed(1234)
	estimates store method`m'
	matrix b_method`m' = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m`m' "`e(allvars_sel)'"

	* Evaluate
	lassocoef method`m', sort(coef, standardized) nofvlabel
	lassogof method`m', over(sample) postselection
	forvalues s = 1/2 {
		qui sum `depvar' if sample==`s'
		di "Variance in sample==`s': `r(Var)'"
	}

	* Post-selection OLS (with geography and industry fixed effects)
	reghdfe `depvar' `selected_vars_m`m'' if sample==1, absorb(uf ind2)
	estimates store method`m'_ols
	matrix b_method`m'_ols = e(b)'

	* Export: the names option writes row and column names, then the header cells are relabeled
	putexcel set "`xlsx_fem'", sheet(method`m') `filemode'
	putexcel A1=matrix(b_method`m'), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"

	putexcel D1=matrix(b_method`m'_ols), names nformat(number_d2)
	putexcel D1 = "Clause"
	putexcel E1 = "Coef (OLS)"

	** ENTIRE SAMPLE --> just to check the difference **

	* Adaptive lasso
	* NOTE(review): this overwrites the split-sample matrix of the same name
	lasso linear `depvar' fe_f cl_*, selection(adaptive) rseed(1234)
	estimates store method`m'_v2
	matrix b_method`m' = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)

	* Export
	putexcel set "`xlsx_fem'", sheet(method`m'_v2) modify
	putexcel A1=matrix(b_method`m'), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"
}
	
	
*****************************************************
*** select male clauses with data driven method ***
*****************************************************
* NOTE(review): eststo clear only drops results saved with eststo; the results below are
* saved with estimates store, so this line does not remove them - confirm what was intended.
eststo clear

* Male outcomes, one per method:
*   method1 = log PageRank, method2 = log normalized PageRank, method3 = PageRank index
* For every method the same steps are run:
*   1. adaptive lasso of the outcome on fe_m and all cl_ variables, estimated on sample==1
*   2. evaluation: selected coefficients, goodness of fit by sample, outcome variance by sample
*   3. post-selection OLS on sample==1 with uf and ind2 absorbed as fixed effects
*   4. export of the standardized lasso coefficients and the OLS coefficients
*   5. the same lasso on the entire sample, exported to a separate sheet, as a check
* NOTE(review): pagerank_m_norm is created as a copy of pagerank_m, so method2 repeats
* method1 for men - confirm that this is intended.
local outcomes_mal "ln_pagerank_m ln_pagerank_m_norm pagerank_m_index"
local xlsx_mal "$tables/amenity_value_gender_mal.xlsx"

local m = 0
foreach depvar of local outcomes_mal {
	local ++m

	* The first export recreates the workbook; later exports add sheets to it
	local filemode modify
	if `m'==1 local filemode replace

	** SPLIT SAMPLE **

	* Adaptive lasso
	lasso linear `depvar' fe_m cl_* if sample==1, selection(adaptive) rseed(1234)
	estimates store method`m'
	matrix b_method`m' = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)
	local selected_vars_m`m' "`e(allvars_sel)'"

	* Evaluate
	lassocoef method`m', sort(coef, standardized) nofvlabel
	lassogof method`m', over(sample) postselection
	forvalues s = 1/2 {
		qui sum `depvar' if sample==`s'
		di "Variance in sample==`s': `r(Var)'"
	}

	* Post-selection OLS (with geography and industry fixed effects)
	reghdfe `depvar' `selected_vars_m`m'' if sample==1, absorb(uf ind2)
	estimates store method`m'_ols
	matrix b_method`m'_ols = e(b)'

	* Export: the names option writes row and column names, then the header cells are relabeled
	putexcel set "`xlsx_mal'", sheet(method`m') `filemode'
	putexcel A1=matrix(b_method`m'), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"

	putexcel D1=matrix(b_method`m'_ols), names nformat(number_d2)
	putexcel D1 = "Clause"
	putexcel E1 = "Coef (OLS)"

	** ENTIRE SAMPLE --> just to check the difference **

	* Adaptive lasso
	* NOTE(review): this overwrites the split-sample matrix of the same name
	lasso linear `depvar' fe_m cl_*, selection(adaptive) rseed(1234)
	estimates store method`m'_v2
	matrix b_method`m' = e(b_standardized)'
	lassocoef, display(coef, standardized) sort(coef, standardized)

	* Export
	putexcel set "`xlsx_mal'", sheet(method`m'_v2) modify
	putexcel A1=matrix(b_method`m'), names nformat(number_d2)
	putexcel A1 = "Clause"
	putexcel B1 = "Coef (standardized)"
}
	
	
* The log is opened under capture at the top of this file, so it may not be open here;
* closing it under capture as well keeps the script from aborting on its last line.
cap log close
